In [1]:
# !apt install git-lfs
# !git lfs install
# !git clone https://huggingface.co/datasets/parambharat/malayalam_asr_corpus

# !add-apt-repository -y ppa:jonathonf/ffmpeg-4
# !apt update
# !apt install -y ffmpeg

# !pip uninstall -y transformers datasets 
# !pip install audiomentations
# !pip install git+https://github.com/huggingface/datasets
# !pip install git+https://github.com/huggingface/transformers
# !pip install librosa soundfile
# !pip install "evaluate>=0.3.0"
# !pip install jiwer
# !pip install more-itertools
# !pip install wandb
# !pip install bitsandbytes
# !pip install "bokeh<2.5.0"
# !pip install "holoviews[recommended]"
# !pip install pyarrow
In [2]:
%set_env WANDB_LOG_MODEL=True
%set_env WANDB_WATCH=all
%set_env WANDB_NOTEBOOK_NAME=whisper_small_ml.ipynb
env: WANDB_LOG_MODEL=True
env: WANDB_WATCH=all
env: WANDB_NOTEBOOK_NAME=whisper_small_ml.ipynb
In [3]:
import torch
from torch.utils.data import IterableDataset

from io import StringIO
import string
from dataclasses import dataclass
from typing import Any, Dict, List, Union

import wandb
from IPython.display import clear_output
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import numpy as np
In [4]:
from transformers import WhisperForConditionalGeneration
from transformers import WhisperProcessor
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from huggingface_hub import notebook_login
from transformers import TrainerCallback
from transformers.integrations import WandbCallback
from transformers.trainer_pt_utils import IterableDatasetShard
from datasets import Dataset, IterableDatasetDict, load_dataset, interleave_datasets, Audio 
from datasets import load_dataset, Audio
import evaluate
In [5]:
from pathlib import Path
import pandas as pd
import holoviews as hv
import panel as pn
import tempfile
from bokeh.resources import INLINE
import jiwer
In [6]:
hv.extension("bokeh", logo=False)
In [9]:
# wandb.login()
In [10]:
# notebook_login()
In [11]:
run = wandb.init(project="whisper_finetuning", job_type="fine-tuning", group="small-ml")
artifact = run.use_artifact('parambharat/whisper_finetuning/model-2dl0413q:latest', type='model')
artifact_dir = artifact.download()
wandb: Currently logged in as: parambharat. Use `wandb login --relogin` to force relogin
Tracking run with wandb version 0.13.6
Run data is saved locally in /home/ubuntu/whisper-finetuning/notebooks/wandb/run-20221212_191352-219trr2l
Syncing run wandering-shape-58 to Weights & Biases (docs)
wandb: Downloading large artifact model-2dl0413q:latest, 923.99MB. 10 files... 
wandb:   10 of 10 files downloaded.  
Done. 0:0:0.0
In [12]:
def load_data_splits(is_streaming=True, stopping_strategy="all_exhausted"):
    """Load the local Malayalam ASR corpus splits.

    Parameters
    ----------
    is_streaming : bool
        Passed through as ``streaming`` to ``datasets.load_dataset``.
    stopping_strategy : str
        Currently unused; kept for backward compatibility (presumably
        intended for ``interleave_datasets`` if multiple corpora are
        combined later — TODO confirm).

    Returns
    -------
    The (Iterable)DatasetDict produced by ``load_dataset``.
    """
    # NOTE(review): the original also created an unused `dataset_dict = {}`;
    # removed as dead code.
    return load_dataset("../data/malayalam_asr_corpus/", streaming=is_streaming)
In [13]:
dataset_dict = load_data_splits()
In [14]:
# Waveform-level augmentation pipeline: each transform fires independently
# with probability p=0.3, so many samples pass through partially or fully
# unchanged.
augment_waveform = Compose([
    AddGaussianNoise(min_amplitude=0.005, max_amplitude=0.015, p=0.3),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.3, leave_length_unchanged=False),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.3)
    ,])

def augment_dataset(batch):
    """Apply the augmentation pipeline to one example's waveform in place.

    Assumes audio is sampled at 16 kHz — TODO confirm against the corpus
    metadata; a mismatch would distort the pitch/stretch transforms.
    """

    audio = batch["audio"]["array"]
    # apply augmentation
    augmented_audio = augment_waveform(samples=audio, sample_rate=16000)

    batch["audio"]["array"] = augmented_audio

    return batch


# call augment dataset on the training set
dataset_dict["train"] = dataset_dict["train"].map(augment_dataset)
In [15]:
# Log-Mel feature extractor for whisper-small (expects 16 kHz audio).
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small"
)
# Tokenizer configured for Malayalam transcription; 225 matches the
# generation_max_length / label truncation used elsewhere in this notebook.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small", 
     language="Malayalam",
     task="transcribe",
     model_max_length=225
)
# Processor bundles the feature extractor + tokenizer (used by the collator
# and passed to the Trainer as `tokenizer`).
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
     language="Malayalam", 
     task="transcribe",
     model_max_length=225
)
In [16]:
def fix_sentence(sentence):
    """Normalise a transcription for training.

    Strips wrapping double quotes, guarantees a terminal punctuation mark
    (".", "?" or "!"), and removes all other punctuation from the body.

    Returns the cleaned sentence. An empty string (possibly produced by the
    quote-stripping step) is returned unchanged — the original implementation
    raised IndexError on `transcription[-1]` for empty input.
    """
    transcription = sentence

    if transcription.startswith('"') and transcription.endswith('"'):
        # we can remove wrapping quotation marks as they do not affect the transcription
        transcription = transcription[1:-1]

    # guard: empty input (or a lone '"' reduced to "" above) has no last char
    if not transcription:
        return transcription

    if transcription[-1] not in [".", "?", "!"]:
        # append a full-stop to sentences that do not end in punctuation
        transcription = transcription + "."

    # strip every punctuation mark except the final terminator
    transcription = transcription[:-1].translate(
        str.maketrans('', '', string.punctuation)) + transcription[-1]
    return transcription
    
def prepare_dataset(examples):
    """Turn one raw example into model inputs: log-Mel features + label ids."""
    audio = examples["audio"]

    # log-Mel spectrogram of the waveform (assumed 16 kHz — TODO confirm)
    extracted = feature_extractor(audio["array"], sampling_rate=16000)
    examples["input_features"] = extracted.input_features[0]

    # normalise the transcript, then tokenize into label ids (truncated to 225)
    cleaned_sentence = fix_sentence(examples["sentence"])
    examples["labels"] = tokenizer(
        cleaned_sentence, max_length=225, truncation=True).input_ids
    return examples
In [17]:
def filter_empty_strings(sentence):
    """Dataset filter predicate: keep only sentences at least 2 chars long."""
    return len(sentence) >= 2
In [18]:
for k in dataset_dict:
    dataset_dict[k] = dataset_dict[k].filter(filter_empty_strings, input_columns=["sentence"])
In [19]:
for k in dataset_dict:
    dataset_dict[k] = dataset_dict[k].map(
        prepare_dataset,).with_format("torch")
In [20]:
dataset_dict["train"] = dataset_dict["train"].shuffle(buffer_size=500)
In [21]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collator for speech seq2seq training: pads audio input features and
    tokenized labels separately (they need different padding schemes) and
    masks label padding with -100 so it is ignored by the loss.
    """

    # WhisperProcessor bundling the feature extractor and the tokenizer
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a padded batch dict with `input_features` and `labels`."""
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        # NOTE(review): truncate_sequences is called with its default
        # num_tokens_to_remove=0, which looks like a no-op around
        # feature["labels"] — verify this was the intent.
        label_features = [{"input_ids": self.processor.tokenizer.truncate_sequences(feature["labels"])[0]}
                          for feature in features]
        # pad the labels to max length
        
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt",)

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch
In [22]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
In [23]:
metric = evaluate.load("wer")

# evaluate with the 'normalised' WER
do_normalize_eval = True


def compute_metrics(pred):
    """Compute word error rate (%) over a batch of generated predictions."""
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # -100 marks loss-ignored positions; restore pad ids so they decode cleanly
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # decode without grouping tokens; normalization controlled by the
    # module-level do_normalize_eval flag
    decode = processor.tokenizer.batch_decode
    pred_str = decode(pred_ids, skip_special_tokens=True, normalize=do_normalize_eval)
    label_str = decode(label_ids, skip_special_tokens=True, normalize=do_normalize_eval)

    return {"wer": 100 * metric.compute(predictions=pred_str, references=label_str)}
In [24]:
model = WhisperForConditionalGeneration.from_pretrained(artifact_dir, use_cache=False)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False
In [25]:
# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
class ShuffleCallback(TrainerCallback):
    """Reshuffles streamable datasets at the start of each epoch by bumping
    their epoch counter (a Trainer callback)."""

    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        dataset = train_dataloader.dataset
        if isinstance(dataset, IterableDatasetShard):
            # the Trainer itself calls set_epoch() on sharded datasets
            return
        if isinstance(dataset, IterableDataset):
            dataset.set_epoch(dataset._epoch + 1)
            
In [26]:
def load_samples_dataset(dataset, num_samples=100):
    """Materialise the first `num_samples` examples of a (streaming) dataset
    into an in-memory `datasets.Dataset`."""
    collected = []
    for idx, example in enumerate(dataset):
        collected.append(example)
        if idx == num_samples - 1:
            break
    return Dataset.from_list(collected)

def compute_spectrograms(example):
    """Return an un-padded log-Mel spectrogram for the example's waveform
    (used later for visualisation, so padding is disabled)."""
    waveform = example["audio"]["array"]
    extracted = feature_extractor(waveform, sampling_rate=16000, padding="do_not_pad")
    return {"spectrogram": extracted.input_features[0]}


def record_to_html(sample_record):
    """Render one sample as standalone HTML: an audio player, its spectrogram
    and its waveform, with a time cursor kept in sync across all three.

    Expects keys: audio.array, audio.sampling_rate, spectrogram, and length
    (presumably the clip duration in seconds — verify against the dataset).
    Returns a StringIO holding the saved HTML document.
    """
    audio_array = np.array(sample_record["audio"]["array"])
    audio_sr = sample_record["audio"]["sampling_rate"]
    audio_duration = sample_record["length"]
    audio_spectrogram = np.array(sample_record["spectrogram"])

    # image bounds: x spans the clip duration, y spans spectrogram magnitude
    bounds = (0,0, audio_duration, audio_spectrogram.max())

    # panel's Audio pane expects 16-bit PCM samples
    waveform_int = np.int16(audio_array * 32767)

    
    
    hv_audio = pn.pane.Audio(waveform_int, sample_rate=audio_sr, name='Audio', throttle=500)
    
    # hidden slider acts as the shared "current time" model for the cursors
    slider = pn.widgets.FloatSlider(end=audio_duration, visible=False, step=0.001)
    line_audio = hv.VLine(0).opts(color='black')
    line_spec = hv.VLine(0).opts(color='red')
    
    
    # JS-side links: playback time <-> slider <-> both cursor lines
    slider.jslink(hv_audio, value='time', bidirectional=True)
    slider.jslink(line_audio, value='glyph.location')
    slider.jslink(line_spec, value='glyph.location')
    
    time = np.linspace(0, audio_duration, num=len(audio_array))
    line_plot_hv = hv.Curve(
        (time, audio_array), ["Time (s)", "amplitude"]).opts(
        width=500, height=150, axiswise=True) * line_audio
    
    hv_spec_gram = hv.Image(
        audio_spectrogram, bounds=(bounds), kdims=["Time (s)", "Frequency (hz)"]).opts(
        width=500, height=150, labelled=[], axiswise=True, color_levels=512)* line_spec
    
    
    # lay the widgets out in a row and serialise to an in-memory HTML file
    combined = pn.Row(hv_audio, hv_spec_gram, line_plot_hv, slider)
    audio_html = StringIO()
    combined.save(audio_html)
    return audio_html


def dataset_to_records(dataset):
    """Build a DataFrame of W&B-loggable records (rendered audio HTML,
    reference sentence, clip length) — one row per dataset item."""
    rows = [
        {
            "audio_with_spec": wandb.Html(record_to_html(item)),
            "sentence": item["sentence"],
            "length": item["length"],
        }
        for item in dataset
    ]
    return pd.DataFrame(rows)
    
def decode_predictions(trainer, predictions):
    """Decode generated token ids from a PredictionOutput into plain strings."""
    return trainer.tokenizer.batch_decode(
        predictions.predictions, skip_special_tokens=True)


def compute_measures(predictions, labels):
    """Per-sample jiwer error measures, as a DataFrame with columns
    wer / hits / substitutions / deletions / insertions.

    Note: jiwer.compute_measures takes (truth, hypothesis), so labels come
    first in each call."""
    rows = [
        jiwer.compute_measures(label, prediction)
        for prediction, label in zip(predictions, labels)
    ]
    return pd.DataFrame(rows)[["wer", "hits", "substitutions", "deletions", "insertions"]]

class WandbProgressResultsCallback(WandbCallback):
    """WandbCallback that also logs a table of sample predictions (audio,
    spectrogram, per-sample WER measures) on every log step, and uploads the
    saved model as a W&B artifact on every checkpoint save.
    """

    def __init__(self, trainer, sample_dataset): 
        super().__init__()
        self.trainer = trainer
        self.sample_dataset = sample_dataset
        # pre-render the audio/spectrogram HTML once; reused on every log step
        self.records_df = dataset_to_records(sample_dataset)

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Forward metrics to W&B, then log a per-sample predictions table."""
        super().on_log(args, state, control, model, logs)
        # BUG FIX: the original referenced the global `trainer` here; use the
        # trainer bound in __init__ so the callback is self-contained.
        predictions = self.trainer.predict(self.sample_dataset)
        predictions = decode_predictions(self.trainer, predictions)
        measures_df = compute_measures(predictions, self.records_df["sentence"].tolist())
        records_df = pd.concat([self.records_df, measures_df], axis=1)
        records_df["prediction"] = predictions
        records_df["step"] = state.global_step
        records_table = self._wandb.Table(dataframe=records_df)
        self._wandb.log({"sample_predictions": records_table})
        
    def on_save(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Save the model to a temp dir and upload it as a W&B model artifact
        (world-process-zero only, and only when model logging is enabled)."""
        # BUG FIX: `numbers` was never imported anywhere in this notebook,
        # so the isinstance check below raised NameError at save time.
        import numbers

        if self._wandb is None:
            return
        if self._log_model and self._initialized and state.is_world_process_zero:
            with tempfile.TemporaryDirectory() as temp_dir:
                self.trainer.save_model(temp_dir)
                # artifact metadata: either all numeric summary values, or the
                # best-model metrics when load_best_model_at_end is set
                # (mirrors the upstream HF WandbCallback, including its
                # "total_floss" key spelling)
                metadata = (
                    {
                        k: v
                        for k, v in dict(self._wandb.summary).items()
                        if isinstance(v, numbers.Number) and not k.startswith("_")
                    }
                    if not args.load_best_model_at_end
                    else {
                        f"eval/{args.metric_for_best_model}": state.best_metric,
                        "train/total_floss": state.total_flos,
                    }
                )
                artifact = self._wandb.Artifact(
                    name=f"model-{self._wandb.run.id}",
                    type="model", metadata=metadata)
                for f in Path(temp_dir).glob("*"):
                    if f.is_file():
                        with artifact.new_file(f.name, mode="wb") as fa:
                            fa.write(f.read_bytes())
                self._wandb.run.log_artifact(artifact)
In [27]:
# Training configuration: 3000 optimizer steps at an effective batch size of
# 64, with fp16 + gradient checkpointing + 8-bit Adam to fit whisper-small
# in GPU memory.
training_args = Seq2SeqTrainingArguments(
    output_dir="../models/whisper-small-ml",  # change to a repo name of your choice
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    save_total_limit=4,
    warmup_steps=500,
    max_steps=3000,  # streaming dataset, so training is bounded by steps, not epochs
    gradient_checkpointing=True,
    fp16=True,
#     fp16_full_eval=True,
    optim="adamw_bnb_8bit",  # bitsandbytes 8-bit AdamW
    evaluation_strategy="steps",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=225,  # matches the tokenizer's model_max_length
    save_steps=500,
    eval_steps=500,
    logging_steps=250,
    # NOTE(review): built-in W&B reporting is disabled; logging is instead
    # handled by a manually-added WandbCallback subclass elsewhere.
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=True,
    hub_strategy="checkpoint",  # also push the last checkpoint for resuming
    remove_unused_columns=False, 
    ignore_data_skip=True
)
In [28]:
samples_dataset = load_samples_dataset(dataset_dict["test"]).map(compute_spectrograms)
  0%|          | 0/100 [00:00<?, ?ex/s]
In [29]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=samples_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    callbacks=[ShuffleCallback()],
    
)
Cloning https://huggingface.co/parambharat/whisper-small-ml into local empty directory.
Download file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]
Download file training_args.bin: 100%|##########| 3.48k/3.48k [00:00<?, ?B/s]
Clean file training_args.bin:  29%|##8       | 1.00k/3.48k [00:00<?, ?B/s]
Clean file pytorch_model.bin:   0%|          | 1.00k/922M [00:00<?, ?B/s]
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend
In [30]:
progress_callback = WandbProgressResultsCallback(trainer, samples_dataset)
clear_output()
In [31]:
trainer.add_callback(progress_callback)
In [32]:
model.save_pretrained(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
In [34]:
trainer.train()
***** Running training *****
  Num examples = 192000
  Num Epochs = 9223372036854775807
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3000
  Number of trainable parameters = 241734912
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[3000/3000 11:21:45, Epoch 28/9223372036854775807]
Step Training Loss Validation Loss Wer
500 0.127500 0.163005 35.401460
1000 0.090000 0.182085 40.024331
1500 0.062000 0.200353 37.712895
2000 0.044100 0.210513 36.253041
2500 0.033500 0.224975 37.712895
3000 0.027600 0.230829 36.739659

[4/4 01:47]
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-500
Configuration saved in ../models/whisper-small-ml/checkpoint-500/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Saving model checkpoint to /tmp/tmp2wmi9emx
Configuration saved in /tmp/tmp2wmi9emx/config.json
Model weights saved in /tmp/tmp2wmi9emx/pytorch_model.bin
Feature extractor saved in /tmp/tmp2wmi9emx/preprocessor_config.json
tokenizer config file saved in /tmp/tmp2wmi9emx/tokenizer_config.json
Special tokens file saved in /tmp/tmp2wmi9emx/special_tokens_map.json
added tokens file saved in /tmp/tmp2wmi9emx/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]
Upload file last-checkpoint/optimizer.pt:   0%|          | 32.0k/700M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-ml
   5d4a3d1..5d2c21c  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
   5d2c21c..40e9491  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-1000
Configuration saved in ../models/whisper-small-ml/checkpoint-1000/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-1000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-1000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-1000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-1000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Saving model checkpoint to /tmp/tmpyxx0nddx
Configuration saved in /tmp/tmpyxx0nddx/config.json
Model weights saved in /tmp/tmpyxx0nddx/pytorch_model.bin
Feature extractor saved in /tmp/tmpyxx0nddx/preprocessor_config.json
tokenizer config file saved in /tmp/tmpyxx0nddx/tokenizer_config.json
Special tokens file saved in /tmp/tmpyxx0nddx/special_tokens_map.json
added tokens file saved in /tmp/tmpyxx0nddx/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]
Upload file last-checkpoint/optimizer.pt:   0%|          | 32.0k/700M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-ml
   40e9491..e003b12  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
   e003b12..f25dcc0  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-1500
Configuration saved in ../models/whisper-small-ml/checkpoint-1500/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-1500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-1500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-1500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-1500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Saving model checkpoint to /tmp/tmpoawgpo4v
Configuration saved in /tmp/tmpoawgpo4v/config.json
Model weights saved in /tmp/tmpoawgpo4v/pytorch_model.bin
Feature extractor saved in /tmp/tmpoawgpo4v/preprocessor_config.json
tokenizer config file saved in /tmp/tmpoawgpo4v/tokenizer_config.json
Special tokens file saved in /tmp/tmpoawgpo4v/special_tokens_map.json
added tokens file saved in /tmp/tmpoawgpo4v/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-ml
   f25dcc0..9426188  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
   9426188..1d3689a  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-2000
Configuration saved in ../models/whisper-small-ml/checkpoint-2000/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-2000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-2000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-2000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-2000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-2000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Saving model checkpoint to /tmp/tmpcpk221pg
Configuration saved in /tmp/tmpcpk221pg/config.json
Model weights saved in /tmp/tmpcpk221pg/pytorch_model.bin
Feature extractor saved in /tmp/tmpcpk221pg/preprocessor_config.json
tokenizer config file saved in /tmp/tmpcpk221pg/tokenizer_config.json
Special tokens file saved in /tmp/tmpcpk221pg/special_tokens_map.json
added tokens file saved in /tmp/tmpcpk221pg/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]
Upload file last-checkpoint/optimizer.pt:   0%|          | 32.0k/700M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-ml
   1d3689a..5de9b65  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
   5de9b65..9208d97  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-2500
Configuration saved in ../models/whisper-small-ml/checkpoint-2500/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-2500/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-2500/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-2500/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-2500/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-2500/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ml/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to /tmp/tmp5t2kkkrw
Configuration saved in /tmp/tmp5t2kkkrw/config.json
Model weights saved in /tmp/tmp5t2kkkrw/pytorch_model.bin
Feature extractor saved in /tmp/tmp5t2kkkrw/preprocessor_config.json
tokenizer config file saved in /tmp/tmp5t2kkkrw/tokenizer_config.json
Special tokens file saved in /tmp/tmp5t2kkkrw/special_tokens_map.json
added tokens file saved in /tmp/tmp5t2kkkrw/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-ml
   9208d97..f4c20f9  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
   f4c20f9..808e33c  main -> main

***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
***** Running Evaluation *****
  Num examples = 100
  Batch size = 32
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
Saving model checkpoint to ../models/whisper-small-ml/checkpoint-3000
Configuration saved in ../models/whisper-small-ml/checkpoint-3000/config.json
Model weights saved in ../models/whisper-small-ml/checkpoint-3000/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/checkpoint-3000/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/checkpoint-3000/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/checkpoint-3000/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/checkpoint-3000/added_tokens.json
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Deleting older checkpoint [../models/whisper-small-ml/checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to /tmp/tmpxn6yu83x
Configuration saved in /tmp/tmpxn6yu83x/config.json
Model weights saved in /tmp/tmpxn6yu83x/pytorch_model.bin
Feature extractor saved in /tmp/tmpxn6yu83x/preprocessor_config.json
tokenizer config file saved in /tmp/tmpxn6yu83x/tokenizer_config.json
Special tokens file saved in /tmp/tmpxn6yu83x/special_tokens_map.json
added tokens file saved in /tmp/tmpxn6yu83x/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin:   0%|          | 32.0k/922M [00:00<?, ?B/s]
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-ml
   808e33c..ce0c6c5  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
   ce0c6c5..cf560b9  main -> main



Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ../models/whisper-small-ml/checkpoint-500 (score: 35.4014598540146).
***** Running Prediction *****
  Num examples = 100
  Batch size = 32
/home/ubuntu/whisper-finetuning/notebooks/../models/whisper-small-ml is already a clone of https://huggingface.co/parambharat/whisper-small-ml. Make sure you pull the latest changes with `repo.git_pull()`.
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend
Saving model checkpoint to /tmp/tmp6d21ms5i
Configuration saved in /tmp/tmp6d21ms5i/config.json
Model weights saved in /tmp/tmp6d21ms5i/pytorch_model.bin
Feature extractor saved in /tmp/tmp6d21ms5i/preprocessor_config.json
tokenizer config file saved in /tmp/tmp6d21ms5i/tokenizer_config.json
Special tokens file saved in /tmp/tmp6d21ms5i/special_tokens_map.json
added tokens file saved in /tmp/tmp6d21ms5i/added_tokens.json
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/parambharat/whisper-small-ml
   cf560b9..e388abe  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}}
To https://huggingface.co/parambharat/whisper-small-ml
   e388abe..acd6019  main -> main

Out[34]:
TrainOutput(global_step=3000, training_loss=0.06887441301345826, metrics={'train_runtime': 40824.8754, 'train_samples_per_second': 4.703, 'train_steps_per_second': 0.073, 'total_flos': 5.501245769220096e+19, 'train_loss': 0.06887441301345826, 'epoch': 28.02})
In [38]:
def load_test_dataset(config="fleurs"):
    """Load a streaming ASR test split for Malayalam evaluation.

    Parameters
    ----------
    config : str, optional
        ``"fleurs"`` loads the google/fleurs ``ml_in`` test split; any
        other value loads the Common Voice 11.0 ``ml`` test split.

    Returns
    -------
    A streaming (iterable) dataset containing only ``audio`` and
    ``sentence`` columns, resampled to 16 kHz, mapped through
    ``prepare_dataset`` and formatted for torch.
    """
    if config == "fleurs":
        test_split = load_dataset(
            "google/fleurs", "ml_in", split="test",
            use_auth_token=True, streaming=True)
        # FLEURS names the transcript column "transcription"; normalize
        # to "sentence" so both sources share a schema.
        test_split = test_split.rename_column("transcription", "sentence")
    else:
        test_split = load_dataset(
            "mozilla-foundation/common_voice_11_0", "ml", split="test",
            use_auth_token=True, streaming=True)
    # Shared tail of the pipeline (previously duplicated per branch,
    # with a redundant double cast_column in the Common Voice branch).
    test_split = test_split.remove_columns(
        [col for col in test_split.features.keys()
         if col not in ["audio", "sentence"]])
    # Whisper's feature extractor expects 16 kHz audio; the original
    # sampling_rate=160000 was a 10x typo.
    test_split = test_split.cast_column("audio", Audio(sampling_rate=16000))
    return test_split.map(prepare_dataset).with_format("torch")
In [39]:
# Any argument other than "fleurs" selects the Common Voice 11.0 split.
test_dataset = load_test_dataset("common voice")
Downloading builder script:   0%|          | 0.00/8.30k [00:00<?, ?B/s]
Downloading readme:   0%|          | 0.00/12.2k [00:00<?, ?B/s]
Downloading extra modules:   0%|          | 0.00/3.44k [00:00<?, ?B/s]
Downloading extra modules:   0%|          | 0.00/60.9k [00:00<?, ?B/s]
In [40]:
# Evaluate the fine-tuned model on the streaming test split.
# NOTE(review): this cell was interrupted (KeyboardInterrupt in the saved
# output) — no evaluation metrics were actually produced for this dataset.
trainer.evaluate(test_dataset)
***** Running Evaluation *****
  Num examples: Unknown
  Batch size = 32
Reading metadata...: 112it [00:00, 1358.16it/s]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[40], line 1
----> 1 trainer.evaluate(test_dataset)

File ~/whisper_ft/lib/python3.8/site-packages/transformers/trainer_seq2seq.py:78, in Seq2SeqTrainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix, **gen_kwargs)
     73 gen_kwargs["num_beams"] = (
     74     gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams
     75 )
     76 self._gen_kwargs = gen_kwargs
---> 78 return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)

File ~/whisper_ft/lib/python3.8/site-packages/transformers/trainer.py:2818, in Trainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
   2815 start_time = time.time()
   2817 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 2818 output = eval_loop(
   2819     eval_dataloader,
   2820     description="Evaluation",
   2821     # No point gathering the predictions if there are no metrics, otherwise we defer to
   2822     # self.args.prediction_loss_only
   2823     prediction_loss_only=True if self.compute_metrics is None else None,
   2824     ignore_keys=ignore_keys,
   2825     metric_key_prefix=metric_key_prefix,
   2826 )
   2828 total_batch_size = self.args.eval_batch_size * self.args.world_size
   2829 if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:

File ~/whisper_ft/lib/python3.8/site-packages/transformers/trainer.py:3000, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   2997         batch_size = observed_batch_size
   2999 # Prediction step
-> 3000 loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
   3001 inputs_decode = self._prepare_input(inputs["input_ids"]) if args.include_inputs_for_metrics else None
   3003 if is_torch_tpu_available():

File ~/whisper_ft/lib/python3.8/site-packages/transformers/trainer_seq2seq.py:198, in Seq2SeqTrainer.prediction_step(self, model, inputs, prediction_loss_only, ignore_keys)
    195 else:
    196     generation_inputs = inputs[self.model.main_input_name]
--> 198 generated_tokens = self.model.generate(
    199     generation_inputs,
    200     **gen_kwargs,
    201 )
    202 # in case the batch is shorter than max length, the output should be padded
    203 if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]:

File ~/whisper_ft/lib/python3.8/site-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
     24 @functools.wraps(func)
     25 def decorate_context(*args, **kwargs):
     26     with self.clone():
---> 27         return func(*args, **kwargs)

File ~/whisper_ft/lib/python3.8/site-packages/transformers/generation/utils.py:1518, in GenerationMixin.generate(self, inputs, max_length, min_length, do_sample, early_stopping, num_beams, temperature, penalty_alpha, top_k, top_p, typical_p, repetition_penalty, bad_words_ids, force_words_ids, bos_token_id, pad_token_id, eos_token_id, length_penalty, no_repeat_ngram_size, encoder_no_repeat_ngram_size, num_return_sequences, max_time, max_new_tokens, decoder_start_token_id, use_cache, num_beam_groups, diversity_penalty, prefix_allowed_tokens_fn, logits_processor, renormalize_logits, stopping_criteria, constraints, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, forced_bos_token_id, forced_eos_token_id, remove_invalid_values, synced_gpus, exponential_decay_length_penalty, suppress_tokens, begin_suppress_tokens, forced_decoder_ids, **model_kwargs)
   1513         raise ValueError(
   1514             f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search."
   1515         )
   1517     # 10. run greedy search
-> 1518     return self.greedy_search(
   1519         input_ids,
   1520         logits_processor=logits_processor,
   1521         stopping_criteria=stopping_criteria,
   1522         pad_token_id=pad_token_id,
   1523         eos_token_id=eos_token_id,
   1524         output_scores=output_scores,
   1525         return_dict_in_generate=return_dict_in_generate,
   1526         synced_gpus=synced_gpus,
   1527         **model_kwargs,
   1528     )
   1530 elif is_contrastive_search_gen_mode:
   1532     if num_return_sequences > 1:

File ~/whisper_ft/lib/python3.8/site-packages/transformers/generation/utils.py:2298, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, **model_kwargs)
   2295 next_token_logits = outputs.logits[:, -1, :]
   2297 # pre-process distribution
-> 2298 next_tokens_scores = logits_processor(input_ids, next_token_logits)
   2300 # Store scores, attentions and hidden_states when required
   2301 if return_dict_in_generate:

File ~/whisper_ft/lib/python3.8/site-packages/transformers/generation/logits_process.py:92, in LogitsProcessorList.__call__(self, input_ids, scores, **kwargs)
     90         scores = processor(input_ids, scores, **kwargs)
     91     else:
---> 92         scores = processor(input_ids, scores)
     93 return scores

File ~/whisper_ft/lib/python3.8/site-packages/transformers/generation/logits_process.py:733, in SuppressTokensLogitsProcessor.__call__(self, input_ids, scores)
    732 def __call__(self, input_ids, scores):
--> 733     scores[:, self.suppress_tokens] = -float("inf")
    734     return scores

KeyboardInterrupt: 
In [35]:
# Model-card metadata forwarded to `trainer.push_to_hub`.
# (dataset_tags / dataset fields intentionally omitted:
#  "mozilla-foundation/common_voice_11_0" / "Common Voice 11.0")
kwargs = dict(
    language="ml",
    # a 'pretty' name for your model
    model_name="Whisper Small ML - Bharat Ramanathan",
    finetuned_from="openai/whisper-small",
    tasks="automatic-speech-recognition",
    tags="whisper-event",
)
In [36]:
# Push the final checkpoint and model card (built from `kwargs`) to the Hub.
trainer.push_to_hub(**kwargs)
Saving model checkpoint to ../models/whisper-small-ml
Configuration saved in ../models/whisper-small-ml/config.json
Model weights saved in ../models/whisper-small-ml/pytorch_model.bin
Feature extractor saved in ../models/whisper-small-ml/preprocessor_config.json
tokenizer config file saved in ../models/whisper-small-ml/tokenizer_config.json
Special tokens file saved in ../models/whisper-small-ml/special_tokens_map.json
added tokens file saved in ../models/whisper-small-ml/added_tokens.json
Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Automatic Speech Recognition', 'type': 'automatic-speech-recognition'}, 'metrics': [{'name': 'Wer', 'type': 'wer', 'value': 36.73965936739659}]}
To https://huggingface.co/parambharat/whisper-small-ml
   acd6019..973cfdb  main -> main

In [41]:
# Close the active W&B run and flush remaining logs/artifacts.
wandb.finish()
Waiting for W&B process to finish... (success).
VBox(children=(Label(value='5702.348 MB of 5702.348 MB uploaded (166.992 MB deduped)\r'), FloatProgress(value=…

Run history:


eval/loss▁▃▅▆▇█
eval/runtime▃█▅▇▄▁
eval/samples_per_second▆▁▄▂▅█
eval/steps_per_second▁▁▁▁▁█
eval/wer▁█▅▂▅▃
train/epoch▁▂▂▂▃▃▃▄▄▅▆▆▆▇▇▇███
train/global_step▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇███████
train/learning_rate▅█▇▇▆▅▅▄▃▂▂▁
train/loss█▇▆▅▄▃▃▂▂▁▁▁
train/total_flos▁
train/train_loss▁
train/train_runtime▁
train/train_samples_per_second▁
train/train_steps_per_second▁

Run summary:


eval/loss0.23083
eval/runtime149.6244
eval/samples_per_second0.668
eval/steps_per_second0.027
eval/wer36.73966
train/epoch28.02
train/global_step3000
train/learning_rate0.0
train/loss0.0276
train/total_flos5.501245769220096e+19
train/train_loss0.06887
train/train_runtime40824.8754
train/train_samples_per_second4.703
train/train_steps_per_second0.073

Synced wandering-shape-58: https://wandb.ai/parambharat/whisper_finetuning/runs/219trr2l
Synced 5 W&B file(s), 19 media file(s), 128 artifact file(s) and 0 other file(s)
Find logs at: ./wandb/run-20221212_191352-219trr2l/logs
In [1]:
# NOTE(review): leftover interactive introspection — `push_to_hub?` only
# opens the docstring pane in IPython and should be removed from a final
# notebook. Also note this imports from `evaluate`, shadowing nothing, but
# it is unrelated to `trainer.push_to_hub` used earlier.
from evaluate import push_to_hub
push_to_hub?
In [ ]: